#!/usr/bin/env perl
use strict;
use warnings;
use Data::Dumper;

###############GLOBAL VARIABLES################################
	
my $HG19TOHG18CHAIN="/home/yunfeiguo/Downloads/liftover/hg19ToHg18.over.chain"; 
my $HG18TOHG19CHAIN="/home/yunfeiguo/Downloads/liftover/hg18ToHg19.over.chain";
my $LIFTOVER="liftOver";
#chromosome naming rule
#1KG
my @ONEKG_CHR=qw(1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y MT XY);
#UCSC
my @UCSC_CHR=qw(chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY chrM chrXY);
#NUMERICAL
my @INT_CHR=qw(1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26);

my %ALL_NAMING=('numerical'=>\@INT_CHR,'1KG'=>\@ONEKG_CHR,'UCSC'=>\@UCSC_CHR);
###############END OF GLOBAL VARIABLES################################

#convert hg18 to hg19 or hg19 to hg18
#accept any input as long as method to decipher chr:pos is given
#accept any chromosome naming (numerical, 1KG, UCSC)
#
my $usage="any_hg_convert <hg18|hg19> <format:bed|matrix> <chr naming> <bin> <file1 file2 ...>\n".
	  "hg18 or hg19 refers to the build you want convert TO.\n".
	  "<chr naming> numerical, 1KG, UCSC\n".
	  "<bin> integer for BIN column, 0 for none, 1 for exist (first column). No other integer allowed.\n";
die "$usage\n" unless @ARGV>=4;
my $build=shift @ARGV;
my $format=shift @ARGV;
my $naming=shift @ARGV;
my $bin=shift @ARGV;
my @files=@ARGV;

die "ERROR: hg18 or hg19 build only\n" unless $build eq 'hg18' or $build eq 'hg19';
die "ERROR: matrix, bed only\n" unless $format eq 'matrix' or $format eq 'bed';
die "ERROR: ",join (" ",keys %ALL_NAMING)," only\n" unless defined $ALL_NAMING{$naming};
die "ERROR: 0 or 1 only\n" unless $bin eq '0' or $bin eq '1';

warn "NOTICE: Converting @files\n";
warn "NOTICE: Convert to build $build\n";
warn "NOTICE: Input in $format format\n";
warn "NOTICE: assume 1st col is colname, 1st row is rowname\n" if $format eq 'matrix';
warn "NOTICE: assume column and row names look like chr:pos-pos\n" if $format eq 'matrix';
warn "WARNING: File is assumed to be tab-delimited without checking\n";

for my $i(@files)
{
    &do_convert($build,$format,$naming,$bin,$i);
}

#####################SUBROUTINES###############

#convert file to the specified build in $format
sub do_convert
{
    my $build=shift;
    my $format=shift;
    my $naming=shift;
    my $bin=shift;
    my $file=shift;
    my %tochr;
    my %backchr;
    my $chain=$HG18TOHG19CHAIN;

    @tochr{@{$ALL_NAMING{$naming}}}=@UCSC_CHR;
    @backchr{@UCSC_CHR}=@{$ALL_NAMING{$naming}};

    if ($build eq 'hg18')
    {
	$chain=$HG19TOHG18CHAIN;
    }

    if ($bin eq '1'and $format eq 'matrix')
    {
	die "ERROR: No BIN column allowed for matrix format\n";
    }
    if ($format eq 'bed')
    {
	&convertBED({file=>$file,tochr=>\%tochr,backchr=>\%backchr,chain=>$chain,bin=>$bin,build=>$build});
    } elsif ($format eq 'matrix')
    {
	&convertMatrix({file=>$file,tochr=>\%tochr,backchr=>\%backchr,chain=>$chain,build=>$build});
    }
}

#convert input in BED to the other build
sub convertBED
{
    my $config=shift;
    my $chr_converted="/tmp/".rand($$).".chrconverted";
    my $hg_converted="/tmp/".rand($$).".hgconverted";
    my $output=$config->{file}.$config->{build};

    &convertCHR($config->{tochr},$config->{file},$chr_converted,$config->{bin});

    !system("$LIFTOVER -bedPlus=3 -tab ".($config->{bin} eq '1'?"-hasBin":"")." $chr_converted $config->{chain} $hg_converted /tmp/$$.liftover.tmp ") or die "Failed to lift over: $!\n";
    &convertCHR($config->{backchr},$hg_converted,$output,$config->{bin});
    warn "NOTICE: $output done\n";
}

#convert input in Matrix format to the other build
sub convertMatrix
{
       my $config=shift;
       my $extracted_chr="/tmp/".rand($$).".extracted_chr"; #extract chr:pos-pos info from matrix, in BED
       my $chr_converted="/tmp/".rand($$).".chrconverted";
       my $hg_converted="/tmp/".rand($$).".hgconverted";
       my $extracted_chr_output="/tmp/".rand($$).".extracted_chr_output";
       my $output=$config->{file}.$config->{build};

       &extractCHR($config->{file},$extracted_chr);
       &convertCHR($config->{tochr},$extracted_chr,$chr_converted);
    
       !system("$LIFTOVER -bedPlus=3 -tab $chr_converted $config->{chain} $hg_converted /tmp/$$.liftover.tmp ") or die "Failed to lift over: $!\n";
       &convertCHR($config->{backchr},$hg_converted,$extracted_chr_output);
       &fillCHR($extracted_chr_output,$config->{file},$output);

       warn "NOTICE: $output done\n";
}
#extract chr:pos-pos from colname of input
sub extractCHR
{
    my $in=shift;
    my $out=shift;
    open IN,'<',$in or die "$in: $!\n";
    open OUT,'>',$out or die "$out: $!\n";

    while(<IN>)
    {
	#only look at the 1st line
	s/^\s+$//g;
	#colname
	my @f=split /\t/;
	for my $i(@f[1..$#f])
	{
	    my ($chr,$pos1,$pos2)= $i=~/(.*?):(\d+?)-(\d+)/;
	    die "ERROR: unable to parse colname ($i) in $in\n" unless defined $chr and defined $pos1 and defined $pos2;
	    print OUT "$chr\t$pos1\t$pos2\t$i\n"; #4th column serves as key in case some regions are unmapped during liftover
	}
	last;
    }
    close IN;
    close OUT;
}
#put chr and pos back to colname and rowname of input, and therefore generate output
sub fillCHR
{
    my $chr=shift;
    my $template=shift;
    my $out=shift;
    my %chr_with_key;
    my @colname;
    my $unmap_count; #count how many colnames are unmapped during liftover
    my %unlink_col; #record col number to remove in output
    my @output_col; #columns to stay
    open CHR,'<',$chr or die "$chr: $!\n";
    open TEMPLATE,'<',$template or die "$template: $!\n";
    open OUT,'>',$out or die "$out: $!\n";

    while(<CHR>)
    {
	s/\s+$//;
	my @f=split /\t/;
	$chr_with_key{$f[3]}=[@f[0..2]];
    }
    close CHR;

    $.=0; #reset line counter in case it's not yet
    while(<TEMPLATE>)
    {
	s/\s+$//;
	my @f=split /\t/;
	if($.==1)
	{#colname
	    @colname=@f;
	    for my $i(1..$#f)
	    {
		if(defined $chr_with_key{$f[$i]})
		{
		    my ($chr,$pos1,$pos2)= @{$chr_with_key{$f[$i]}};
		    $f[$i]="$chr:$pos1-$pos2";
		}
		else
		{
		    $unlink_col{$i}=1;#start from 1!
		}
	    }
	    @output_col=grep {!$unlink_col{$_}} (0..$#colname);
	}
	else
	{
	    die "ERROR: unequal number of fields at line $. of $template\n" unless @f==@colname;
	    next if $unlink_col{$.-1};
	    die "ERROR: rowname colname inconsistent at line $. of $template\n" unless $f[0] eq $colname[$.-1];
	    my ($chr,$pos1,$pos2)= @{$chr_with_key{$f[0]}};
	    $f[0]="$chr:$pos1-$pos2";
	}
	print OUT join("\t",@f[@output_col]),"\n";
    }
    close TEMPLATE;
    close OUT;
}
#convert chr based on %chr, assume format is BED
sub convertCHR
{
    my $hash_ref=shift;
    my $from=shift;
    my $to=shift;
    my $bin=shift;
    open IN,'<',$from or die "$from: $!\n";
    open OUT,'>',$to or die "$to: $!\n";

    while(<IN>)
    {
	next if /^#|^\s*$/;
	s/\s+$//g;
	my @f=split /\t/;
	if(defined $bin && $bin eq '1')
	{
	    die "ERROR: Expected at least 4 fields in $from\n" unless @f>=4;
	    die "ERROR: unrecognized chromosome name ($f[1]) in $from!\n" unless defined $hash_ref->{$f[1]};
	    $f[1]=$hash_ref->{$f[1]};
	}
	else
	{
	    die "ERROR: Expected at least 3 fields in $from\n" unless @f>=3;
	    die "ERROR: unrecognized chromosome name ($f[0]) in $from!\n" unless defined $hash_ref->{$f[0]};
	    $f[0]=$hash_ref->{$f[0]};
	}
	print OUT join ("\t",@f),"\n";
    }
    close IN;
    close OUT;
}
